The following script is used to analyse the dataset containing phenotypic data (max.specific growth rate, lag phase, biomass and ethanol yield and cell dry weight) of 24 Saccharomyces cerevisiae strains cultivated in 29 conditions (triplicates).
Load libraries
library(tidyverse)
library(ggplot2)
library(ggpubr)
library(ggsci)
library(RColorBrewer)
library(ggridges)
library(stats)
library(gghighlight)
library(ggbeeswarm)
library(uwot)
library(ggpubr)
library(GGally)
Load data
df_strain_long <- readRDS("~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Github/scripts/df_strain_long_tr.rds")
Visualize data for each strain and phenotype
ggplot(df_strain_long, aes(x=strain, y=value, order=strain_type)) +
geom_violin(alpha=0.5, color="gray")+
geom_point(aes(colour = factor(group), alpha=0.9))+
# geom_boxplot()+
facet_wrap(~phenotypes, ncol=1, strip.position = "left", scales ="free_y")+
scale_color_manual(values = c("acid" = "#00AFBB", "alcohol" = "#3581d8", "hexose" = "#fba465", "pentose" = "#FC4E07", "salt" = "grey", "aldehyde" = "#f2c85b"))+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
## Warning: Removed 225 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 225 rows containing missing values (`geom_point()`).
Visualize only the means for each group of conditions and for each strain
df_strain <- df_strain_long %>% pivot_wider(names_from = phenotypes,values_from = value)
#calculating different means based on conditions, strains, groups and replicates
#datasets for plotting
ggdot_mean <- df_strain %>%
gather(key="phenotypes", value="value", c("umax", "lag", "Yx", "Yp", "CDW")) %>%
group_by(strain, phenotypes, group ) %>%
mutate(mean=mean(value, na.rm=TRUE))
head(ggdot_mean)
| Sample | strain | strain_type | condition | replicate | group | phenotypes | value | mean |
|---|---|---|---|---|---|---|---|---|
| Glu_65_A1_CENPK7D | CENPK7D | lab | Glu_65 | 1 | hexose | umax | 0.1443125 | 0.1085969 |
| Glu_65_A2_CENPK7D | CENPK7D | lab | Glu_65 | 2 | hexose | umax | 0.1451979 | 0.1085969 |
| Glu_65_A3_CENPK7D | CENPK7D | lab | Glu_65 | 3 | hexose | umax | 0.1462409 | 0.1085969 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | umax | 0.1469902 | 0.1085969 |
| Glu_30_A5_CENPK7D | CENPK7D | lab | Glu_30 | 2 | hexose | umax | 0.1412626 | 0.1085969 |
| Glu_30_A6_CENPK7D | CENPK7D | lab | Glu_30 | 3 | hexose | umax | 0.1430554 | 0.1085969 |
ggdot_mean2 <- ggdot_mean %>%
group_by(strain, phenotypes ) %>%
mutate(mean=mean(value, na.rm=TRUE)) %>%
filter(condition=="Glu_30", replicate==1)
head(ggdot_mean2)
| Sample | strain | strain_type | condition | replicate | group | phenotypes | value | mean |
|---|---|---|---|---|---|---|---|---|
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | umax | 0.1469902 | 0.0939345 |
| Glu_30_A4_ETRED | ETRED | ind | Glu_30 | 1 | hexose | umax | 0.1475120 | 0.1205679 |
| Glu_30_A4_LBCM1001 | LBCM1001 | LBCM | Glu_30 | 1 | hexose | umax | 0.1484369 | 0.0795548 |
| Glu_30_A4_LBCM1003 | LBCM1003 | LBCM | Glu_30 | 1 | hexose | umax | 0.1524228 | 0.1017694 |
| Glu_30_A4_LBCM1008 | LBCM1008 | LBCM | Glu_30 | 1 | hexose | umax | 0.1302099 | 0.0853253 |
| Glu_30_A4_LBCM1013 | LBCM1013 | LBCM | Glu_30 | 1 | hexose | umax | 0.1566515 | 0.0970893 |
#calculating means to compare datapoints
mean_phenotype_group <- df_strain %>%
gather(key="phenotypes", value="value", c("umax", "lag", "Yx", "Yp", "CDW")) %>%
group_by(phenotypes, group ) %>%
dplyr::summarise(mean_value = mean(value, na.rm=TRUE))
## `summarise()` has grouped output by 'phenotypes'. You can override using the
## `.groups` argument.
head(mean_phenotype_group)
| phenotypes | group | mean_value |
|---|---|---|
| CDW | acid | 1.4167610 |
| CDW | alcohol | 0.5014860 |
| CDW | aldehyde | 0.9938367 |
| CDW | hexose | 2.5982082 |
| CDW | pentose | 1.0163559 |
| CDW | salt | 1.4869071 |
phenotype_mean <- ggdot_mean %>%
group_by(phenotypes ) %>%
mutate(mean=mean(value, na.rm=TRUE)) %>%
filter(strain=="CENPK7D",condition=="Glu_30", replicate==1)
head(phenotype_mean)
| Sample | strain | strain_type | condition | replicate | group | phenotypes | value | mean |
|---|---|---|---|---|---|---|---|---|
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | umax | 0.1469902 | 0.0964515 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | lag | 6.4992669 | 7.8736138 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | Yx | 0.1183531 | 0.1092942 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | Yp | 0.2602114 | 0.0670376 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | CDW | 3.4974000 | 1.5002589 |
condition_mean <- ggdot_mean %>%
group_by(phenotypes, condition) %>%
mutate(mean=mean(value, na.rm=TRUE))%>%
filter(replicate==1, strain=="CENPK7D")
head(condition_mean)
| Sample | strain | strain_type | condition | replicate | group | phenotypes | value | mean |
|---|---|---|---|---|---|---|---|---|
| Glu_65_A1_CENPK7D | CENPK7D | lab | Glu_65 | 1 | hexose | umax | 0.1443125 | 0.1425594 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | umax | 0.1469902 | 0.1382374 |
| Xyl_36_A7_CENPK7D | CENPK7D | lab | Xyl_36 | 1 | pentose | umax | 0.0994732 | 0.0975475 |
| Xyl_16_A10_CENPK7D | CENPK7D | lab | Xyl_16 | 1 | pentose | umax | 0.1049668 | 0.1051068 |
| Gal_4.5_B1_CENPK7D | CENPK7D | lab | Gal_4.5 | 1 | hexose | umax | 0.0940127 | 0.1093622 |
| Gal_2_B4_CENPK7D | CENPK7D | lab | Gal_2 | 1 | hexose | umax | 0.0899561 | 0.0947701 |
p <- ggdotchart(ggdot_mean, x = "strain", y = "mean",
group = "strain_type",
color = "group", # Color by groups
palette = c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"), # Custom color palette # Add segments from y = 0 to dots
add.params = list(color = "lightgray", size = 2), # Change segment color and size
dot.size = 2, # Large dot size
ggtheme = theme_light() # ggplot2 theme
)+
geom_hline(yintercept = 0, linetype = 2, color = "lightgray") +
facet_grid(vars(phenotypes), scales = "free_y")+
theme(
text = element_text(size = 20),
axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face = "bold"),
strip.background = element_rect(fill = "grey"),
strip.text.x = element_text(size = 20, color = "black", face = "bold"),
strip.text.y = element_text(size = 20, color = "black", face = "bold")
)
p2 <- p +
geom_point(data = ggdot_mean2,aes(x=strain, y=mean), shape=95, size= 7)+
facet_grid(vars(phenotypes), scales = "free_y")
p2
#ggsave("phenotype_mean.svg", width = 500, height = 300, unit = "mm", dpi = 300, path="~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Path_to_submission/Final Figures/R_figures")
Create metadata
df_meta <-
df_strain_long %>%
select(Sample,strain,strain_type,condition,replicate,group) %>%
distinct()
Let’s begin by log-normalizing the data
df_strain_long %>%
mutate(value_log = log10(value+1e-4)) %>% View()
pca_data <-
df_strain_long %>%
mutate(value_log = log10(value+1e-4)) %>%
select(-value) %>%
pivot_wider(names_from = phenotypes,values_from = value_log)
head(pca_data)
| Sample | strain | strain_type | condition | replicate | group | CDW | Yx | Yp | umax | lag |
|---|---|---|---|---|---|---|---|---|---|---|
| Glu_65_A1_CENPK7D | CENPK7D | lab | Glu_65 | 1 | hexose | 0.7409545 | -1.0684447 | -1.1284004 | -0.8403951 | 0.8330939 |
| Glu_65_A2_CENPK7D | CENPK7D | lab | Glu_65 | 2 | hexose | 0.7524249 | -1.0599972 | -1.0014028 | -0.8377407 | 0.8323042 |
| Glu_65_A3_CENPK7D | CENPK7D | lab | Glu_65 | 3 | hexose | 0.7259035 | -1.0849855 | -0.9420420 | -0.8346342 | 0.8473676 |
| Glu_30_A4_CENPK7D | CENPK7D | lab | Glu_30 | 1 | hexose | 0.5437577 | -0.9264536 | -0.5845068 | -0.8324163 | 0.8128711 |
| Glu_30_A5_CENPK7D | CENPK7D | lab | Glu_30 | 2 | hexose | 0.5319513 | -0.9365973 | -0.6073209 | -0.8496655 | 0.8099530 |
| Glu_30_A6_CENPK7D | CENPK7D | lab | Glu_30 | 3 | hexose | 0.5267914 | -0.9483274 | -0.4836635 | -0.8441922 | 0.7436396 |
Make data suitable for prcomp function
df_pca <-
pca_data %>%
select(Sample,CDW,Yx,Yp,umax,lag) %>%
as.data.frame()
rownames(df_pca) <- df_pca$Sample
df_pca <-
df_pca %>%
select(-Sample)
Run PCA
pca1 <- prcomp(na.omit(df_pca),center = T,scale. = F)
pca1$x %>% head()
## PC1 PC2 PC3 PC4 PC5
## Glu_65_A1_CENPK7D -0.4955068 0.19883285 0.5328749 -0.14936278 -0.10156267
## Glu_65_A2_CENPK7D -0.6015676 0.13087592 0.5505046 -0.16179786 -0.10411027
## Glu_65_A3_CENPK7D -0.6254141 0.06681808 0.5520449 -0.17399608 -0.08783624
## Glu_30_A4_CENPK7D -0.9447666 -0.15021062 0.3712173 -0.12596684 -0.03088934
## Glu_30_A5_CENPK7D -0.9163720 -0.15087262 0.3607644 -0.12263541 -0.04279616
## Glu_30_A6_CENPK7D -1.0112277 -0.23511007 0.3865725 -0.06899068 -0.05846269
Store PCA results and merge with metadata
df_pcares <-
pca1$x %>%
as_tibble(rownames = "Sample") %>%
select(Sample,PC1,PC2)
head(df_pcares)
| Sample | PC1 | PC2 |
|---|---|---|
| Glu_65_A1_CENPK7D | -0.4955068 | 0.1988329 |
| Glu_65_A2_CENPK7D | -0.6015676 | 0.1308759 |
| Glu_65_A3_CENPK7D | -0.6254141 | 0.0668181 |
| Glu_30_A4_CENPK7D | -0.9447666 | -0.1502106 |
| Glu_30_A5_CENPK7D | -0.9163720 | -0.1508726 |
| Glu_30_A6_CENPK7D | -1.0112277 | -0.2351101 |
df_pca_res <-
df_pcares %>%
full_join(df_meta,by = "Sample")
df_pca_res %>%
ggplot(aes(x=PC1,y=PC2,color=group)) +
geom_point() +
stat_ellipse() +
theme_bw() +
theme(aspect.ratio = 1)
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Warning: Removed 225 rows containing missing values (`geom_point()`).
df_pca_res %>%
ggplot(aes(x=PC1,y=PC2,color=strain_type)) +
geom_point() +
stat_ellipse() +
theme_bw() +
theme(aspect.ratio = 1)
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Removed 225 rows containing missing values (`geom_point()`).
Check outliers
outlier_samples <-
df_pca_res %>%
filter(PC1>5 & PC2<0) %>%
pull(Sample)
df_strain_long %>%
filter(Sample %in% outlier_samples) %>%
pivot_wider(names_from = phenotypes,values_from = value) %>% head()
| Sample | strain | strain_type | condition | replicate | group |
|---|
Run UMAP on log-normalized data and store output
set.seed(1980)
umap1 <- umap(na.omit(df_pca),n_components = 3)
df_umap_res <-
umap1 %>%
as_tibble(rownames = "Sample") %>%
rename("UMAP1" = "V1") %>%
rename("UMAP2" = "V2") %>%
rename("UMAP3" = "V3") %>%
full_join(df_meta, by = "Sample")
## Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
## `.name_repair` is omitted as of tibble 2.0.0.
## ℹ Using compatibility `.name_repair`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Plot UMAP
df_umap_res %>%
ggplot(aes(x=UMAP1,y=UMAP2,color=group)) +
geom_point() +
stat_ellipse() +
theme_bw() +
theme(aspect.ratio = 1)
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Warning: Removed 225 rows containing missing values (`geom_point()`).
plotly::plot_ly(df_umap_res, x = ~UMAP1, y = ~UMAP2, z = ~UMAP3, color = df_umap_res$group, colors = c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"))
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: Ignoring 225 observations
df_umap_res %>%
ggplot(aes(x=UMAP1,y=UMAP2,color=strain_type)) +
geom_point() +
stat_ellipse() +
theme_void() +
theme(aspect.ratio = 1) +
ggtitle("UMAP")
## Warning: Removed 225 rows containing non-finite values (`stat_ellipse()`).
## Removed 225 rows containing missing values (`geom_point()`).
By group
df_strain_long %>%
mutate(sugar = group) %>%
select(-group) %>%
ggplot(aes(x=sugar,y=value, fill = sugar)) +
geom_jitter()
## Warning: Removed 225 rows containing missing values (`geom_point()`).
df_strain_long %>%
ggplot(aes(x=group,y=value, fill = group)) +
geom_violin(draw_quantiles = 0.5) +
scale_fill_manual(values=c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"))+
facet_grid(vars(phenotypes), scales = "free_y")+
theme_light()+
theme(
text = element_text(size = 20),
axis.text = element_text(size = 20),
axis.title = element_text(size = 20, face = "bold"),
strip.background = element_rect(fill = "grey"),
strip.text.x = element_text(size = 20, color = "black", face = "bold"),
strip.text.y = element_text(size = 20, color = "black", face = "bold")
)
## Warning: Removed 225 rows containing non-finite values (`stat_ydensity()`).
#ggsave("groups_phenotype_distribution.svg", width = 500, height = 300, unit = "mm", dpi = 300, path="~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Path_to_submission/Final Figures/R_figures")
#Evaluate if the groups are statistically different
cm <- compare_means(value ~ group, data = df_strain_long, ref.group = "hexose",
method = "wilcox.test", group.by = "phenotypes")
By condition
df_strain_long %>%
mutate(condition = factor(condition,levels = c("Glu_20",
"Glu_30",
"Glu_65",
"Xyl_16",
"Xyl_36",
"Gal_2",
"Gal_4.5",
"Ara_2",
"Ara_4",
"Man_10",
"Man_16",
"EtOH_45",
"EtOH_90",
"NaCl_25",
"NaCl_80",
"FrAcid_1",
"FrAcid_3.5",
"AcAcid_4.5",
"AcAcid_6",
"LacAcid_2",
"LacAcid_7",
"LevAcid_2.5",
"LevAcid_5",
"HMF_0.5",
"HMF_6",
"Furf_1",
"Furf_3",
"Van_0.5",
"Van_2"),ordered = T)) %>%
ggplot(aes(x=condition,y=value, fill = condition)) +
geom_violin() +
facet_wrap(vars(phenotypes),ncol = 1,scales = "free_y") +
theme_bw() +
theme(aspect.ratio = 0.2,
axis.text.x = element_text(angle = 45,hjust = 1)) +
ggtitle("All phenotypes",subtitle = "Grouped by strain")+
stat_compare_means(ref.group = "Glu_20",label = "p.signif",hide.ns = T)
## Warning: Removed 225 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 225 rows containing non-finite values
## (`stat_compare_means()`).
#run correlations for all the combinations of phenotypes
ggpairs(df_strain, columns = c(7:11),ggplot2::aes(colour=group), palette())
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 225 rows containing missing values
## Warning: Removed 225 rows containing missing values (`geom_point()`).
## Removed 225 rows containing missing values (`geom_point()`).
## Removed 225 rows containing missing values (`geom_point()`).
## Removed 225 rows containing missing values (`geom_point()`).
## Warning: Removed 225 rows containing non-finite values (`stat_density()`).
#visualize specific correlations with only two phenotypes per time
#ggscatter by phenotype
ggscatter(df_strain, x = 'umax', y = 'CDW', color = 'group', add = "reg.line", conf.int = FALSE, cor.method = "spearman", size=0.5, palette = c("#00AFBB","#3581d8","#f2c85b", "#fba465", "#FC4E07", "grey"))+
stat_cor(aes(color = group)) +
yscale("log10", .format = TRUE)+
# xscale("log10", .format = TRUE)+
stat_cor(method = "spearman")+
theme_light()+
theme(
text = element_text(size = 7),
axis.text.x = element_text(size = 7, angle=90, vjust = 0.5,hjust = 1),
axis.text.y = element_text(size = 7),
axis.title = element_text(size = 7, face = "bold"),
strip.background = element_rect(fill = "grey"),
strip.text.x = element_text(size = 7, color = "black", face = "bold"),
strip.text.y = element_text(size = 7, color = "black", face = "bold"))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Transformation introduced infinite values in continuous y-axis
## Warning: Removed 225 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 225 rows containing non-finite values (`stat_cor()`).
## Removed 225 rows containing non-finite values (`stat_cor()`).
#ggsave("umax_CDW_corr.svg", width = 150, height = 110, unit = "mm", dpi = 300, path="~/Library/CloudStorage/OneDrive-Chalmers/PhD/Papers/MSB/Path_to_submission/Final Figures/R_figures")